{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['sm_50', 'sm_60', 'sm_61', 'sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Environment check: list the CUDA architectures this PyTorch build was compiled for.\n",
    "import importlib\n",
    "import sys\n",
    "import warnings\n",
    "\n",
    "import torch\n",
    "\n",
    "# warnings.filterwarnings('ignore')  # uncomment to silence library warnings\n",
    "torch.cuda.get_arch_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Thu Jun 13 15:09:11 2024       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 551.76                 Driver Version: 551.76         CUDA Version: 12.4     |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |\n",
      "| N/A   50C    P8              2W /   50W |       0MiB /   8188MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "                                                                                         \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|    0   N/A  N/A     16112    C+G   D:\\chat\\QQ\\QQ.exe                           N/A      |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "# Print the nvidia-smi report (driver version, CUDA version, GPU status).\n",
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "matrix_add.cu\n",
      "tmpxft_00007e18_00000000-10_matrix_add.cudafe1.cpp\n",
      "  正在创建库 martix_add.lib 和对象 martix_add.exp\n",
      "GPU time: 0.849696 ms\n",
      "CPU time: 4.000000 ms\n",
      "加速比: 4.707566\n"
     ]
    }
   ],
   "source": [
    "# Compile matrix_add.cu with nvcc and run the executable, which prints the GPU time,\n",
    "# the CPU time and the resulting speedup.\n",
    "# NOTE(review): -arch=sm_70 is passed only for this build; the other nvcc calls in this\n",
    "# notebook use the compiler's default target -- confirm this is intentional.\n",
    "!nvcc -arch=sm_70 .\\matrix_add.cu -o matrix_add.exe\n",
    "!.\\matrix_add.exe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "matrix_add_2.cu\n",
      "tmpxft_00009870_00000000-10_matrix_add_2.cudafe1.cpp\n",
      "  正在创建库 martix_add_2.lib 和对象 martix_add_2.exp\n",
      "GPU time: 1.393216 ms\n",
      "CPU time: 14.000000 ms\n",
      "加速比: 10.048693\n"
     ]
    }
   ],
   "source": [
    "# Compile matrix_add_2.cu with nvcc and run the executable, which prints the GPU time,\n",
    "# the CPU time and the resulting speedup.\n",
    "!nvcc .\\matrix_add_2.cu -o matrix_add_2.exe\n",
    "!.\\matrix_add_2.exe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gpuinfo.cu\n",
      "tmpxft_00001058_00000000-10_gpuinfo.cudafe1.cpp\n",
      "c:\\Users\\Senhai Xu\\Desktop\\High Performance Computing Course\\gpuinfo.cu(24): warning C4477: “printf”: 格式字符串“%ld”需要类型“long”的参数，但可变参数 1 拥有了类型“size_t”\n",
      "c:\\Users\\Senhai Xu\\Desktop\\High Performance Computing Course\\gpuinfo.cu(24): note: 请考虑在格式字符串中使用“%zd”\n",
      "c:\\Users\\Senhai Xu\\Desktop\\High Performance Computing Course\\gpuinfo.cu(37): warning C4477: “printf”: 格式字符串“%ld”需要类型“long”的参数，但可变参数 1 拥有了类型“size_t”\n",
      "c:\\Users\\Senhai Xu\\Desktop\\High Performance Computing Course\\gpuinfo.cu(37): note: 请考虑在格式字符串中使用“%zd”\n",
      "  正在创建库 gpuinfo.lib 和对象 gpuinfo.exp\n",
      "使用GPU设备 0: NVIDIA GeForce RTX 4060 Laptop GPU\n",
      "SM的数量：24\n",
      "CUDA CORE数量：3072\n",
      "计算能力（Compute Capability）：8.9\n",
      "显存大小：8.00 GB\n",
      "每个线程块的共享内存大小：48.00 KB\n",
      "每个线程块的最大线程数：1024\n",
      "每个线程块的最大共享内存（字节）：49152\n",
      "每个warp的线程数：32\n",
      "每个SM的最大warp数：48\n",
      "每个SM的最大线程块数：24（近似值，因为线程块大小可变）\n",
      "每个SM的最大线程数：1536\n",
      "最大线程块大小：(1024, 1024, 64)\n",
      "每个SM的寄存器数：65536\n",
      "每个线程块的最大寄存器数：65536\n",
      "每个线程的最大寄存器数：42\n",
      "每个SM的共享内存（字节）：102400\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Compile gpuinfo.cu with nvcc and run it to print the properties of the CUDA device.\n",
    "!nvcc .\\gpuinfo.cu -o gpuinfo.exe\n",
    "!.\\gpuinfo.exe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "matrix_multiply.cu\n",
      "tmpxft_00009ad0_00000000-10_matrix_multiply.cudafe1.cpp\n",
      "  正在创建库 matrix_multiply.lib 和对象 matrix_multiply.exp\n",
      "7168.000000 \n",
      "\n",
      "7168.000000 \n",
      "\n",
      "GPU time: 21.199520 ms\n",
      "CPU time: 13432.000000 ms\n",
      "加速比: 633.599248\n"
     ]
    }
   ],
   "source": [
    "# Compile matrix_multiply.cu with nvcc and run it; the program reports the GPU time,\n",
    "# the CPU time and the speedup.\n",
    "!nvcc .\\matrix_multiply.cu -o matrix_multiply.exe\n",
    "!.\\matrix_multiply.exe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "matrix_multiply_2.cu\n",
      "tmpxft_00003fcc_00000000-10_matrix_multiply_2.cudafe1.cpp\n",
      "  正在创建库 matrix_multiply_2.lib 和对象 matrix_multiply_2.exp\n",
      "7161.000000 \n",
      "\n",
      "7161.000000 \n",
      "\n",
      "GPU time: 21.207457 ms\n",
      "CPU time: 12161.000000 ms\n",
      "加速比: 573.430385\n"
     ]
    }
   ],
   "source": [
    "# Compile matrix_multiply_2.cu with nvcc and run it; the program reports the GPU time,\n",
    "# the CPU time and the speedup.\n",
    "!nvcc .\\matrix_multiply_2.cu -o matrix_multiply_2.exe\n",
    "!.\\matrix_multiply_2.exe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "matrix_multiply_3.cu\n",
      "tmpxft_000043b8_00000000-10_matrix_multiply_3.cudafe1.cpp\n",
      "  正在创建库 matrix_multiply_3.lib 和对象 matrix_multiply_3.exp\n",
      "7161.000000 \n",
      "\n",
      "7161.000000 \n",
      "\n",
      "GPU time: 16.241089 ms\n",
      "CPU time: 11649.000000 ms\n",
      "加速比: 717.254865\n"
     ]
    }
   ],
   "source": [
    "# Compile matrix_multiply_3.cu with nvcc and run it; the program reports the GPU time,\n",
    "# the CPU time and the speedup.\n",
    "!nvcc .\\matrix_multiply_3.cu -o matrix_multiply_3.exe\n",
    "!.\\matrix_multiply_3.exe"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch_cuda",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
